We’re going to walk through an example workflow for the scABC package, walking through all the individual steps of the analysis. We’ll use the 6 cell line mixture for an example. The 1632 cells can be downloaded from GEO under accession number GSE65360 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE65360).
We used Kundaje pipeline for aligning pair ended reads to hg19 and removing duplicates (https://github.com/kundajelab/atac_dnase_pipelines) with the command below for each read pair file.
bds_scr [SCR_NAME] atac.bds -align -species hg19 -species_file [SPECIES_FILE_PATH] -nth [NUM_THREADS] -fastq1_1 [READ_PAIR1] -fastq1_2 [READ_PAIR2]
For details on the pipeline, visit the Kundaje lab website or github page.
The resulting bam files were merged using samtools.
samtools merge [AGGREGATE_BAM] *.trim.PE2SE.nodup.bam
The merged bam file was then used as input into MACS2 to call merged peaks for later analysis, using the Kundaje pipeline again.
bds_scr [SCR_NAME] atac.bds -species hg19 -species_file [SPECIES_FILE_PATH] -nth [NUM_THREADS] -se -filt_bam [AGGREGATE_BAM]
Several of the cells have multiple runs. We merged these runs using samtools.
We will use scABC to analyse the single cell data. The cell level bam files and their samtools indexes are contained in the folder bams.
# always run
setwd("~/scRNAseqAnalysis/scATAC-RNAseq/scABC/vignettes/")
library(devtools)
devtools::install_github("timydaley/scABC", force = TRUE)
## Downloading GitHub repo timydaley/scABC@master
## from URL https://api.github.com/repos/timydaley/scABC/zipball/master
## Installing scABC
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/8m/0f0r_lmn4p5bmrzymmp2d0qr0000gn/T/RtmpGMxBvQ/devtools3b4d41d71f03/timydaley-scABC-d2432f3' \
## --library='/Library/Frameworks/R.framework/Versions/3.4/Resources/library' \
## --install-tests
##
library(scABC)
# table that has experiment info
bamfile.table = read.table(file = "NoTreatment/6Lines/SRX&Type&Batch.txt")
head(bamfile.table)
## V1 V2 V3
## 1 SRX859035 H1ESC singles-H1ESC-well-1
## 2 SRX859036 H1ESC singles-H1ESC-well-2
## 3 SRX859037 H1ESC singles-H1ESC-well-3
## 4 SRX859038 H1ESC singles-H1ESC-well-4
## 5 SRX859039 H1ESC singles-H1ESC-well-5
## 6 SRX859040 H1ESC singles-H1ESC-well-6
bamfiles = paste0("bams/", bamfile.table[,1], ".bam")
peaks = selectPeaks("NoTreatment/6Lines/mergeAll.tn5.pf.gappedPeak")
dim(peaks)
## [1] 242875 15
Computing the Forground and Background matrix takes quite a bit of time. We can do this once and cache the results for later.
#
ForeGround = getCountsMatrix(bamfiles, peaks);
#
ForeGroundFiltered = filterPeaks(ForeGround$ForeGroundMatrix, peaks)
peaks = ForeGroundFiltered$peaks
#
BackGround = getBackground(bamfiles, peaks)
ForeGroundBackGroundFiltered = filterSamples(ForeGround = ForeGroundFiltered$ForeGroundMatrix, BackGround = BackGround$BackGroundMatrix)
InSilicoForeGround = ForeGroundBackGroundFiltered$ForeGroundMatrix
InSilicoBackGround = ForeGroundBackGroundFiltered$BackGroundMatrix
InSilicoPeaks = peaks
Now let’s compute the landmarks and take a look at them.
#
InSilicoLandMarks = computeLandmarks(ForeGround = InSilicoForeGround,
BackGround = InSilicoBackGround,
nCluster = 6, lambda = 1, nTop = 2000)
# look at correlation between landmarks
cor(InSilicoLandMarks, InSilicoLandMarks, method = 'spearman')
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0000000 0.5499138 0.4748952 0.4364145 0.4880086 0.4505189
## [2,] 0.5499138 1.0000000 0.5635555 0.4994078 0.6456894 0.4927121
## [3,] 0.4748952 0.5635555 1.0000000 0.4953581 0.5454351 0.4370819
## [4,] 0.4364145 0.4994078 0.4953581 1.0000000 0.4831819 0.4262173
## [5,] 0.4880086 0.6456894 0.5454351 0.4831819 1.0000000 0.4520639
## [6,] 0.4505189 0.4927121 0.4370819 0.4262173 0.4520639 1.0000000
It looks like the cluster landmarks are well seperated. Let’s look at the cell to cluster assignments.
# assign cells to the closest landmark #
InSilicoLandMarkAssignments = assign2landmarks(InSilicoForeGround, InSilicoLandMarks)
InSilicoCell2LandmarkCorrelation = cbind(apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,1], method = 'spearman')),
apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,2], method = 'spearman')),
apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,3], method = 'spearman')),
apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,4], method = 'spearman')),
apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,5], method = 'spearman')),
apply(InSilicoForeGround, 2, function(x) cor(x, InSilicoLandMarks[,6], method = 'spearman')))
cell.info = bamfile.table[which(paste0(bamfile.table[,1], ".bam") %in% colnames(InSilicoForeGround)), 2]
library(gplots)
library(RColorBrewer)
library(devtools)
source_url("https://raw.githubusercontent.com/obigriffith/biostar-tutorials/master/Heatmaps/heatmap.3.R")
## SHA-1 hash of file is 015fc0457e61e3e93a903e69a24d96d2dac7b9fb
scalered <- colorRampPalette(c("white", "red"), space = "rgb")(256)
rcols1 = brewer.pal(6, "Accent")[1:6]
rowcols1 = rcols1[cell.info]
rcols2 = brewer.pal(6, "Dark2")[1:6]
rowcols2 = rcols2[InSilicoLandMarkAssignments]
rowcols = rbind(rowcols1, rowcols2)
rownames(rowcols) = c("rep info", "cluster")
heatmap.3(InSilicoCell2LandmarkCorrelation, dendrogram='none', Rowv=FALSE, Colv=FALSE,
trace='none', col = scalered, margin = c(5, 5), density.info = "none",
RowSideColors = rowcols, RowSideColorsSize=2, symm=F,symkey=F,
symbreaks=F, scale="none")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)), col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)
heatmap.2(InSilicoCell2LandmarkCorrelation, dendrogram='none', Rowv=FALSE,
Colv=FALSE, trace='none', col = scalered, margin = c(5, 5),
density.info = "none", RowSideColors = rowcols1, symm=F,
symkey=F,symbreaks=F, scale="none")
legend("bottomleft", legend = c(unique(cell.info)), col = rcols1, border=FALSE,
bty="n", y.intersp = 0.7, cex=0.7, pch = 15)
To view the above figure better, we’re normalize the correlations of each cell so that the maximum is 1.
# for prettyness
for(i in 1:dim(InSilicoCell2LandmarkCorrelation)[1]){
InSilicoCell2LandmarkCorrelation[i,] = InSilicoCell2LandmarkCorrelation[i,]/mean(InSilicoCell2LandmarkCorrelation[i,])
}
library(gplots)
library(RColorBrewer)
heatmap.2(InSilicoCell2LandmarkCorrelation, dendrogram='none', Rowv=FALSE, Colv=FALSE, trace='none', col = scalered, margin = c(5, 5), density.info = "none",
RowSideColors = rowcols1, symm=F,symkey=F,symbreaks=F, scale="none")
legend("bottomleft", legend = c(unique(cell.info)), col = rcols1, border=FALSE,
bty="n", y.intersp = 0.7, cex=0.7, pch = 15)
The clustering correctly classifies all cells except for 1. This assumes we knew the number of clusters. What if we let scABC choose the number of clusters?
#
InSilicoBackGroundMedian = apply(InSilicoBackGround, 2, median)
InSilicoGapStat = getGapStat(InSilicoForeGround, InSilicoBackGroundMedian,
nClusters=1:10, nPerm = 20, quiet = TRUE)
InSilicoGapStat$nClusterOptimal
## [1] 6
plotGapStat(InSilicoGapStat, nClusters = 1:10, main = "In Silico Gap Stat")
## [1] 1
scABC chooses 6 clusters and we can continue with this number of clusters.
#
InSilicoPeakSelection = getClusterSpecificPvalue(ForeGround=InSilicoForeGround, cluster_assignments = InSilicoLandMarkAssignments, background_medians = InSilicoBackGroundMedian)
##
## Estimating beta MLE
##
0 % converged
0 % converged
0 % converged
0 % converged
0 % converged
0 % converged
2 % converged
6 % converged
10 % converged
22 % converged
32 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
35 % converged
37 % converged
49 % converged
91 % converged
99 % converged
100 % converged
## Estimating beta MAP
##
0 % converged
0 % converged
0 % converged
0 % converged
0 % converged
0 % converged
0 % converged
3 % converged
7 % converged
19 % converged
77 % converged
100 % converged
100 % converged
100 % converged
## Calculating the p-values
InSilicoPeakPvals = InSilicoPeakSelection$pvalue
head(InSilicoPeakPvals)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## [1,] 1.000000e+00 1.0000000 1.000000e+00 2.325067e-48 1.0000000 1.0000000
## [2,] 3.134483e-07 0.9999997 1.000000e+00 1.000000e+00 1.0000000 1.0000000
## [3,] 1.000000e+00 1.0000000 0.000000e+00 1.000000e+00 1.0000000 1.0000000
## [4,] 9.701213e-01 0.9621131 3.788687e-02 9.996608e-01 0.9948569 0.9974671
## [5,] 9.994836e-01 0.9870934 5.277733e-02 9.472227e-01 0.9999959 0.9975895
## [6,] 9.999988e-01 0.9999994 4.079405e-06 9.999992e-01 0.9999962 0.9999959
Let’s visualize the data.
# all peaks
scalered <- colorRampPalette(c("white", "red"), space = "rgb")(256)
rcols1 = brewer.pal(6, "Accent")[1:6]
rowcols1 = rcols1[cell.info]
rcols2 = brewer.pal(6, "Dark2")[1:6]
rowcols2 = rcols2[InSilicoLandMarkAssignments]
rowcols = rbind(rowcols1, rowcols2)
rownames(rowcols) = c("rep info", "cluster")
x = t(InSilicoForeGround)
x = x[ ,head(order(rowSums(InSilicoForeGround), decreasing = TRUE), 10000)]
d = as.dist(1 - cor(x, method = "spearman"))
col.clus = hclust(d, method ="average")
#truncate to see patterns
x[which(x > 5)] = 5
heatmap.3(x, dendrogram='none', Rowv=FALSE, Colv=col.clus, trace='none', col = scalered,
margin = c(5, 5), density.info = "none", RowSideColors = rowcols,
RowSideColorsSize=2, main = "All peaks")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)),
col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)
We’ll select the cluster specific peaks by selecting the 10,000 peaks with the smallest p-values across all clusters.
# cluster specific peaks
ClusterSpecificPeaks = head(order(apply(InSilicoPeakPvals, 1, min),
decreasing = FALSE), 10000)
length(ClusterSpecificPeaks)
## [1] 10000
x = t(InSilicoForeGround[ClusterSpecificPeaks, ])
d = as.dist(1 - cor(x, method = "spearman"))
col.clus = hclust(d, method ="average")
#truncate to see patterns
x[which(x > 5)] = 5
heatmap.3(x, dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), trace='none',
col = scalered, margin = c(5, 5), density.info = "none", RowSideColors = rowcols,
RowSideColorsSize=2, main = "Cluster specific peaks")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)),
col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)
Instead of all of the above commands to obtain the clustered and processed data, we can simply use the scABC command to do all of the above.
library(GenomicRanges)
library(Rsamtools)
library(WeightedCluster)
peakfile = "NoTreatment/6Lines/mergeAll.tn5.pf.gappedPeak"
bamfile.table = read.table(file = "NoTreatment/6Lines/SRX&Type&Batch.txt")
bamfiles = paste0("bams/", bamfile.table[,1], ".bam")
# main function
InSilicoSCABC = scABC(bamfiles, peakfile)
We’ll subset a selection of peaks to look at top 10,000 cluster specific peaks.
#
ClusterSpecificPeaksV2 = head(order(apply(InSilicoSCABC$PeakPVals, 1, min),
decreasing = FALSE), 10000)
length(intersect(ClusterSpecificPeaks, ClusterSpecificPeaksV2))
## [1] 10000
scalered <- colorRampPalette(c("white", "red"), space = "rgb")(256)
rcols1 = brewer.pal(6, "Accent")[1:6]
rowcols1 = rcols1[cell.info]
rcols2 = brewer.pal(6, "Dark2")[1:6]
rowcols2 = rcols2[InSilicoSCABC$cluster_assignments]
rowcols = rbind(rowcols1, rowcols2)
rownames(rowcols) = c("rep info", "cluster")
x = t(InSilicoSCABC$ForeGroundMatrix[ClusterSpecificPeaksV2, ])
d = as.dist(1 - cor(x, method = "spearman"))
col.clus = hclust(d, method ="average")
x[which(x > 5)] = 5
heatmap.3(x, dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), trace='none',
col = scalered, margin = c(5, 5), density.info = "none", RowSideColors = rowcols,
RowSideColorsSize=2, main = "Cluster specific peaks")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)),
col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)
We can also select cluster specific peaks by taking peaks that are less than some threshhold. We have to set it to be really small because Rmarkdown does not support large matrices.
#
ClusterSpecificPeaksV3 = which(apply(InSilicoSCABC$PeakPVals, 1, min) < 0.0001)
length(ClusterSpecificPeaksV3)
## [1] 19009
# all peaks are cluster specific
scalered <- colorRampPalette(c("white", "red"), space = "rgb")(256)
rcols1 = brewer.pal(6, "Accent")[1:6]
rowcols1 = rcols1[cell.info]
rcols2 = brewer.pal(6, "Dark2")[1:6]
rowcols2 = rcols2[InSilicoSCABC$cluster_assignments]
rowcols = rbind(rowcols1, rowcols2)
rownames(rowcols) = c("rep info", "cluster")
x = t(InSilicoSCABC$ForeGroundMatrix[ClusterSpecificPeaksV3, ])
d = as.dist(1 - cor(x, method = "spearman"))
col.clus = hclust(d, method ="average")
x[which(x > 5)] = 5
heatmap.3(x, dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus),
trace='none', col = scalered, margin = c(5, 5), density.info = "none",
RowSideColors = rowcols, RowSideColorsSize=2, main = "Cluster specific peaks")
legend("bottomleft", legend = c(unique(cell.info), paste0("cluster ", 1:6)),
col = c(rcols1, rcols2), border=FALSE, bty="n", y.intersp = 0.7, cex=0.7, pch = 15)
To identify transcription factors active in cluster specific peaks we’ll use chromVAR (https://greenleaflab.github.io/chromVAR/) to compute TF motifs.
First we’ll look at the naive application of chromVAR without selecting for cluster specific peaks
#
ClusterSpecificPeaks = data.frame(chrom = InSilicoSCABC$peaks$chrom, start = InSilicoSCABC$peaks$start, end = InSilicoSCABC$peaks$end, cluster1pvals = InSilicoSCABC$PeakPVals[,1], cluster2pvals = InSilicoSCABC$PeakPVals[,2], cluster3pvals = InSilicoSCABC$PeakPVals[,3], cluster4pvals = InSilicoSCABC$PeakPVals[,4], cluster5pvals = InSilicoSCABC$PeakPVals[,5], cluster6pvals = InSilicoSCABC$PeakPVals[,6])
dim(ClusterSpecificPeaks)
## [1] 94934 9
write.table(ClusterSpecificPeaks, file = "InSilicoPeaksWithPvals.txt", sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)
#BiocInstaller::biocLite("GreenleafLab/chromVAR")
library(chromVAR)
library(motifmatchr)
library(Matrix)
library(SummarizedExperiment)
extra_cols = c(4, 5, 6, 7, 8, 9)
names(extra_cols) = c("cluster1", "cluster2", "cluster3", "cluster4", "cluster5", "cluster6")
peaks = getPeaks("~/scRNAseqAnalysis/scATAC-RNAseq/scABC/vignettes/InSilicoPeaksWithPvals.txt", extra_cols = extra_cols)
## Warning in getPeaks("~/scRNAseqAnalysis/scATAC-RNAseq/scABC/vignettes/InSilicoPeaksWithPvals.txt", : Peaks are not equal width!
## Use resize(peaks, width = x, fix = "center") to make peaks equal in size,
## where x is the desired size of the peaks)
## Warning in getPeaks("~/scRNAseqAnalysis/scATAC-RNAseq/scABC/vignettes/
## InSilicoPeaksWithPvals.txt", : Peaks not sorted
# peaks include broad peaks, remove broad peaks
hist(width(peaks), breaks = 200)
length(peaks)
## [1] 94934
peaks = peaks[which(width(peaks) <= 800)]
length(peaks)
## [1] 23566
peaks = resize(peaks, width = 500, fix = "center")
peaks = sort(peaks)
bamfiles = paste0("bams/", colnames(InSilicoSCABC$ForeGroundMatrix))
frag_counts = getCounts(bamfiles, peaks, paired = FALSE,
colData = data.frame(cluster.assigment = factor(InSilicoSCABC$cluster_assignments)))
library(BSgenome.Hsapiens.UCSC.hg19)
frag_counts = addGCBias(frag_counts, genome = BSgenome.Hsapiens.UCSC.hg19)
motifs <- getJasparMotifs()
motifs.matched = matchMotifs(motifs, frag_counts, genome = BSgenome.Hsapiens.UCSC.hg19)
dev = computeDeviations(object = frag_counts, annotations = motifs.matched)
dev.scores = deviationScores(dev)
variability = computeVariability(dev)
plotVariability(variability, use_plotly = TRUE)
variability[head(order(variability$variability, decreasing = TRUE), 20), ]
## name variability bootstrap_lower_bound
## MA0478.1_FOSL2 FOSL2 4.171021 3.824344
## MA0490.1_JUNB JUNB 4.093872 3.770376
## MA0477.1_FOSL1 FOSL1 4.079807 3.739060
## MA0491.1_JUND JUND 4.074336 3.749512
## MA0099.2_FOS::JUN FOS::JUN 4.022065 3.675929
## MA0476.1_FOS FOS 3.975986 3.662666
## MA0489.1_JUN(var.2) JUN(var.2) 3.955445 3.647968
## MA0841.1_NFE2 NFE2 3.248245 2.996394
## MA0655.1_JDP2 JDP2 3.155490 2.907860
## MA0462.1_BATF::JUN BATF::JUN 3.085327 2.875677
## MA0080.4_SPI1 SPI1 2.695883 2.535176
## MA0139.1_CTCF CTCF 2.602619 2.461270
## MA0140.2_GATA1::TAL1 GATA1::TAL1 2.421919 2.309954
## MA0653.1_IRF9 IRF9 2.393844 2.281333
## MA0808.1_TEAD3 TEAD3 2.389406 2.221844
## MA0652.1_IRF8 IRF8 2.356732 2.246050
## MA0090.2_TEAD1 TEAD1 2.171846 2.025194
## MA0517.1_STAT1::STAT2 STAT1::STAT2 2.165937 2.058199
## MA0809.1_TEAD4 TEAD4 2.134771 1.983942
## MA0501.1_MAF::NFE2 MAF::NFE2 2.038094 1.898431
## bootstrap_upper_bound p_value p_value_adj
## MA0478.1_FOSL2 4.526154 0 0
## MA0490.1_JUNB 4.446734 0 0
## MA0477.1_FOSL1 4.427269 0 0
## MA0491.1_JUND 4.430908 0 0
## MA0099.2_FOS::JUN 4.368081 0 0
## MA0476.1_FOS 4.306116 0 0
## MA0489.1_JUN(var.2) 4.292330 0 0
## MA0841.1_NFE2 3.512678 0 0
## MA0655.1_JDP2 3.419215 0 0
## MA0462.1_BATF::JUN 3.323949 0 0
## MA0080.4_SPI1 2.843321 0 0
## MA0139.1_CTCF 2.752233 0 0
## MA0140.2_GATA1::TAL1 2.529015 0 0
## MA0653.1_IRF9 2.506730 0 0
## MA0808.1_TEAD3 2.563882 0 0
## MA0652.1_IRF8 2.472861 0 0
## MA0090.2_TEAD1 2.317526 0 0
## MA0517.1_STAT1::STAT2 2.267102 0 0
## MA0809.1_TEAD4 2.280900 0 0
## MA0501.1_MAF::NFE2 2.183571 0 0
tsne = deviationsTsne(dev, threshold = 1, perplexity = 10)
tsne_plot = plotDeviationsTsne(dev, tsne, annotation = "FOSL2",
sample_column = "cluster.assigment", shiny = FALSE)
tsne_plot[[1]]
tsne_plot[[2]]
tsne_plot = plotDeviationsTsne(dev, tsne, annotation = "CTCF",
sample_column = "cluster.assigment", shiny = FALSE)
tsne_plot[[2]]
tsne_plot = plotDeviationsTsne(dev, tsne, annotation = "GATA1::TAL1",
sample_column = "cluster.assigment", shiny = FALSE)
tsne_plot[[2]]
top_motifs = variability$name[head(order(variability$variability, decreasing = TRUE), 20)]
names(top_motifs) = rownames(variability[head(order(variability$variability, decreasing = TRUE), 20), ])
top_devs = dev.scores[which(rownames(dev.scores) %in% names(top_motifs)), ]
rownames(top_devs) = top_motifs[match(rownames(top_devs), names(top_motifs))]
library(gplots)
library(RColorBrewer)
scalebluered <- colorRampPalette(c("blue", "white", "red"), space = "rgb")(256)
cols = brewer.pal(6, "Dark2")[1:6]
cols = cols[c(6, 1, 4, 2, 5, 3)] # reorder to agree with Zhi's figure
rowcols = cols[InSilicoSCABC$cluster_assignments]
library(vegan)
d = vegdist(top_devs, method = "euclidean")
col.clus = hclust(d, "centroid")
heatmap.2(t(top_devs), dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), trace='none', col = scalebluered, density.info = "none", RowSideColors = rowcols, margin = c(10, 1))
Most of the TFs are BJ specific. Let’s look at when we take cluster specific peaks, but an equal number of each cluster.
topPeaks = peaks[which(apply(elementMetadata(peaks), 1, min) < 1e-5)]
fragCountsTopPeaks = getCounts(bamfiles, topPeaks, paired = FALSE,
colData = data.frame(cluster.assigment = factor(InSilicoSCABC$cluster_assignments)))
fragCountsTopPeaks = addGCBias(fragCountsTopPeaks, genome = BSgenome.Hsapiens.UCSC.hg19)
motifs <- getJasparMotifs()
motifsTopPeaks.matched = matchMotifs(motifs, fragCountsTopPeaks, genome = BSgenome.Hsapiens.UCSC.hg19)
devTopPeaks = computeDeviations(object = fragCountsTopPeaks, annotations = motifsTopPeaks.matched)
devTopPeaks.scores = deviationScores(devTopPeaks)
variabilityTopPeaks = computeVariability(devTopPeaks)
plotVariability(variabilityTopPeaks, use_plotly = TRUE)
variabilityTopPeaks[head(order(variabilityTopPeaks$variability, decreasing = TRUE), 20), ]
## name variability bootstrap_lower_bound
## MA0478.1_FOSL2 FOSL2 2.268453 2.094568
## MA0490.1_JUNB JUNB 2.219077 2.050912
## MA0099.2_FOS::JUN FOS::JUN 2.217914 2.033092
## MA0477.1_FOSL1 FOSL1 2.201068 2.039151
## MA0491.1_JUND JUND 2.194246 2.023153
## MA0476.1_FOS FOS 2.139884 1.979456
## MA0489.1_JUN(var.2) JUN(var.2) 2.089581 1.919563
## MA0652.1_IRF8 IRF8 1.834726 1.742883
## MA0140.2_GATA1::TAL1 GATA1::TAL1 1.820301 1.736505
## MA0517.1_STAT1::STAT2 STAT1::STAT2 1.771074 1.690564
## MA0841.1_NFE2 NFE2 1.757561 1.621410
## MA0653.1_IRF9 IRF9 1.718928 1.646433
## MA0655.1_JDP2 JDP2 1.708895 1.571240
## MA0830.1_TCF4 TCF4 1.681157 1.601761
## MA0522.2_TCF3 TCF3 1.652910 1.570793
## MA0462.1_BATF::JUN BATF::JUN 1.650556 1.530218
## MA0778.1_NFKB2 NFKB2 1.613152 1.529105
## MA0050.2_IRF1 IRF1 1.581729 1.503034
## MA0080.4_SPI1 SPI1 1.569772 1.485416
## MA0808.1_TEAD3 TEAD3 1.550953 1.455613
## bootstrap_upper_bound p_value p_value_adj
## MA0478.1_FOSL2 2.426135 0.000000e+00 0.000000e+00
## MA0490.1_JUNB 2.370993 0.000000e+00 0.000000e+00
## MA0099.2_FOS::JUN 2.375615 0.000000e+00 0.000000e+00
## MA0477.1_FOSL1 2.347924 0.000000e+00 0.000000e+00
## MA0491.1_JUND 2.350667 0.000000e+00 0.000000e+00
## MA0476.1_FOS 2.280780 0.000000e+00 0.000000e+00
## MA0489.1_JUN(var.2) 2.236815 0.000000e+00 0.000000e+00
## MA0652.1_IRF8 1.917120 2.491118e-244 1.201964e-242
## MA0140.2_GATA1::TAL1 1.903609 1.399384e-236 6.001801e-235
## MA0517.1_STAT1::STAT2 1.844010 5.457580e-211 2.106626e-209
## MA0841.1_NFE2 1.880238 3.379196e-204 1.185791e-202
## MA0653.1_IRF9 1.792765 2.436011e-185 7.835837e-184
## MA0655.1_JDP2 1.833406 1.403980e-180 4.168742e-179
## MA0830.1_TCF4 1.760106 1.035880e-167 2.856069e-166
## MA0522.2_TCF3 1.724996 4.690590e-155 1.207045e-153
## MA0462.1_BATF::JUN 1.759297 5.072664e-154 1.223780e-152
## MA0778.1_NFKB2 1.694941 5.141825e-138 1.167497e-136
## MA0050.2_IRF1 1.656781 3.405758e-125 7.303459e-124
## MA0080.4_SPI1 1.649027 1.815860e-120 3.689063e-119
## MA0808.1_TEAD3 1.639982 3.378729e-113 6.520947e-112
intersect(variabilityTopPeaks$name[head(order(variabilityTopPeaks$variability, decreasing = TRUE), 20)], variability$name[head(order(variability$variability, decreasing = TRUE), 20)])
## [1] "FOSL2" "JUNB" "FOS::JUN" "FOSL1"
## [5] "JUND" "FOS" "JUN(var.2)" "IRF8"
## [9] "GATA1::TAL1" "STAT1::STAT2" "NFE2" "IRF9"
## [13] "JDP2" "BATF::JUN" "SPI1" "TEAD3"
tsneTopPeaks = deviationsTsne(devTopPeaks, threshold = 1, perplexity = 10)
tsneTopPeaks_plot = plotDeviationsTsne(devTopPeaks, tsneTopPeaks, annotation = "FOSL2",
sample_column = "cluster.assigment", shiny = FALSE)
tsne_plot[[1]]
A large number intersect, which is good.
topMotifsTopPeaks = variabilityTopPeaks$name[head(order(variabilityTopPeaks$variability, decreasing = TRUE), 20)]
names(topMotifsTopPeaks) = rownames(variabilityTopPeaks[head(order(variabilityTopPeaks$variability, decreasing = TRUE), 20), ])
topDevsTopPeaks = devTopPeaks.scores[which(rownames(devTopPeaks.scores) %in% names(topMotifsTopPeaks)), ]
rownames(topDevsTopPeaks) = topMotifsTopPeaks[match(rownames(topDevsTopPeaks), names(topMotifsTopPeaks))]
topDevsTopPeaks[which(is.na(topDevsTopPeaks))] = 0
library(gplots)
library(RColorBrewer)
scalebluered <- colorRampPalette(c("blue", "white", "red"), space = "rgb")(256)
cols = brewer.pal(6, "Dark2")[1:6]
cols = cols[c(6, 1, 4, 2, 5, 3)] # reorder to agree with Zhi's figure
rowcols = cols[InSilicoSCABC$cluster_assignments]
library(vegan)
d = vegdist(topDevsTopPeaks, method = "euclidean")
col.clus = hclust(d, "centroid")
heatmap.2(t(topDevsTopPeaks), dendrogram='none', Rowv=FALSE, Colv=as.dendrogram(col.clus), trace='none', col = scalebluered, density.info = "none", RowSideColors = rowcols, margin = c(10, 1))
In the cluster specific peaks the Jun family motifs in BJ are still present, but now we see the Gata1-Tal1 motif in K562 cells (see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3431495/ or http://mcb.asm.org/content/25/4/1215.full) and Pou motifs in H1 cells (espcially Pou3f2, aka Oct4).
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] vegan_2.4-4 lattice_0.20-35
## [3] permute_0.9-4 BSgenome.Hsapiens.UCSC.hg19_1.4.0
## [5] BSgenome_1.44.1 rtracklayer_1.36.4
## [7] SummarizedExperiment_1.6.3 DelayedArray_0.2.7
## [9] matrixStats_0.52.2 Biobase_2.36.2
## [11] Matrix_1.2-11 motifmatchr_0.99.8
## [13] chromVAR_0.5.2 RColorBrewer_1.1-2
## [15] gplots_3.0.1 WeightedCluster_1.2-1
## [17] cluster_2.0.6 TraMineR_2.0-7
## [19] Rsamtools_1.28.0 Biostrings_2.44.2
## [21] XVector_0.16.0 GenomicRanges_1.28.5
## [23] GenomeInfoDb_1.12.2 IRanges_2.10.3
## [25] S4Vectors_0.14.4 BiocGenerics_0.22.0
## [27] scABC_0.99.0 devtools_1.13.3
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.13 VGAM_1.0-4
## [3] colorspace_1.3-2 rprojroot_1.2
## [5] htmlTable_1.9 base64enc_0.1-3
## [7] DT_0.2 bit64_0.9-7
## [9] AnnotationDbi_1.38.2 codetools_0.2-15
## [11] splines_3.4.1 R.methodsS3_1.7.1
## [13] knitr_1.17 jsonlite_1.5
## [15] Formula_1.2-2 seqLogo_1.42.0
## [17] annotate_1.54.0 GO.db_3.4.1
## [19] png_0.1-7 R.oo_1.21.0
## [21] shiny_1.0.5 readr_1.1.1
## [23] compiler_3.4.1 httr_1.3.1
## [25] backports_1.1.0 assertthat_0.2.0
## [27] lazyeval_0.2.0 acepack_1.4.1
## [29] htmltools_0.3.6 tools_3.4.1
## [31] bindrcpp_0.2 glue_1.1.1
## [33] gtable_0.2.0 TFMPvalue_0.0.6
## [35] GenomeInfoDbData_0.99.0 reshape2_1.4.2
## [37] dplyr_0.7.3 Rcpp_0.12.12
## [39] nlme_3.1-131 gdata_2.18.0
## [41] crosstalk_1.0.0 CNEr_1.12.1
## [43] stringr_1.2.0 mime_0.5
## [45] miniUI_0.1.1 poweRlaw_0.70.1
## [47] gtools_3.5.0 XML_3.98-1.9
## [49] JASPAR2016_1.4.0 MASS_7.3-47
## [51] zlibbioc_1.22.0 scales_0.5.0
## [53] hms_0.3 yaml_2.1.14
## [55] curl_2.8.1 memoise_1.1.0
## [57] gridExtra_2.3 ggplot2_2.2.1
## [59] rpart_4.1-11 latticeExtra_0.6-28
## [61] stringi_1.1.5 RSQLite_2.0
## [63] checkmate_1.8.3 caTools_1.17.1
## [65] boot_1.3-20 BiocParallel_1.10.1
## [67] rlang_0.1.2 pkgconfig_2.0.1
## [69] bitops_1.0-6 evaluate_0.10.1
## [71] purrr_0.2.3 bindr_0.1
## [73] labeling_0.3 GenomicAlignments_1.12.2
## [75] htmlwidgets_0.9 bit_1.1-12
## [77] plyr_1.8.4 magrittr_1.5
## [79] R6_2.2.2 Hmisc_4.0-3
## [81] DBI_0.7 mgcv_1.8-20
## [83] foreign_0.8-69 withr_2.0.0
## [85] survival_2.41-3 KEGGREST_1.16.1
## [87] RCurl_1.95-4.8 nnet_7.3-12
## [89] tibble_1.3.4 KernSmooth_2.23-15
## [91] plotly_4.7.1 nabor_0.4.7
## [93] rmarkdown_1.6 TFBSTools_1.14.2
## [95] grid_3.4.1 data.table_1.10.4
## [97] blob_1.1.0 git2r_0.19.0
## [99] digest_0.6.12 xtable_1.8-2
## [101] tidyr_0.7.1 httpuv_1.3.5
## [103] R.utils_2.5.0 munsell_0.4.3
## [105] DirichletMultinomial_1.18.0 viridisLite_0.2.0